package com.huahua.bigdata.spark.rdd.operate;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public class Spark10_Operate_Transform_WordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("spark");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // TODO 分组聚合
        //      1. 读取文件
        JavaRDD<String> lineRdd = jsc.textFile("data/word.txt");
        //      2. 将文件数据进行分解
        JavaRDD<String> wordRDD = lineRdd.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        //      3. 将相同的单词分到一个组中
        JavaPairRDD<String, Iterable<String>> wordGropRDD = wordRDD.groupBy(word -> word);
        //      4. 计算每个单词的组中的数量即可
        JavaPairRDD<String, Integer> wordCountRDD = wordGropRDD.mapValues(
                iter -> {
                    int len = 0;
                    for (String s : iter) {
                        len++;
                    }
                    return len;
                }
        );

        wordCountRDD.collect().forEach(System.out::println);

        jsc.close();
    }
}
